# Packages
library(tidyverse)
library(lubridate)
library(plotly)
library(sqldf)
library(corrplot)
library(rpart)
library(caret)
library(rpart.plot)
library(party)
library(randomForest)
library(e1071)
library(neuralnet)
# Loading dataset
bikes <- read.csv("SeoulBikeData.csv", header = TRUE, sep = ",")
str(bikes)
## 'data.frame': 8760 obs. of 14 variables:
## $ Date : Factor w/ 365 levels "01/01/2018","01/02/2018",..: 12 12 12 12 12 12 12 12 12 12 ...
## $ Rented.Bike.Count : int 254 204 173 107 78 100 181 460 930 490 ...
## $ Hour : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Temperature..C. : num -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
## $ Humidity... : int 37 38 39 40 36 37 35 38 37 27 ...
## $ Wind.speed..m.s. : num 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
## $ Visibility..10m. : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 1928 ...
## $ Dew.point.temperature..C.: num -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
## $ Solar.Radiation..MJ.m2. : num 0 0 0 0 0 0 0 0 0.01 0.23 ...
## $ Rainfall.mm. : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Snowfall..cm. : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Seasons : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Holiday : Factor w/ 2 levels "Holiday","No Holiday": 2 2 2 2 2 2 2 2 2 2 ...
## $ Functioning.Day : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
summary(bikes)
## Date Rented.Bike.Count Hour Temperature..C.
## 01/01/2018: 24 Min. : 0.0 Min. : 0.00 Min. :-17.80
## 01/02/2018: 24 1st Qu.: 191.0 1st Qu.: 5.75 1st Qu.: 3.50
## 01/03/2018: 24 Median : 504.5 Median :11.50 Median : 13.70
## 01/04/2018: 24 Mean : 704.6 Mean :11.50 Mean : 12.88
## 01/05/2018: 24 3rd Qu.:1065.2 3rd Qu.:17.25 3rd Qu.: 22.50
## 01/06/2018: 24 Max. :3556.0 Max. :23.00 Max. : 39.40
## (Other) :8616
## Humidity... Wind.speed..m.s. Visibility..10m. Dew.point.temperature..C.
## Min. : 0.00 Min. :0.000 Min. : 27 Min. :-30.600
## 1st Qu.:42.00 1st Qu.:0.900 1st Qu.: 940 1st Qu.: -4.700
## Median :57.00 Median :1.500 Median :1698 Median : 5.100
## Mean :58.23 Mean :1.725 Mean :1437 Mean : 4.074
## 3rd Qu.:74.00 3rd Qu.:2.300 3rd Qu.:2000 3rd Qu.: 14.800
## Max. :98.00 Max. :7.400 Max. :2000 Max. : 27.200
##
## Solar.Radiation..MJ.m2. Rainfall.mm. Snowfall..cm. Seasons
## Min. :0.0000 Min. : 0.0000 Min. :0.00000 Autumn:2184
## 1st Qu.:0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 Spring:2208
## Median :0.0100 Median : 0.0000 Median :0.00000 Summer:2208
## Mean :0.5691 Mean : 0.1487 Mean :0.07507 Winter:2160
## 3rd Qu.:0.9300 3rd Qu.: 0.0000 3rd Qu.:0.00000
## Max. :3.5200 Max. :35.0000 Max. :8.80000
##
## Holiday Functioning.Day
## Holiday : 432 No : 295
## No Holiday:8328 Yes:8465
##
##
##
##
##
#### Descriptive analysis ####
# Unique variables of humidity
c(unique(bikes["Humidity..."]))
## $Humidity...
## [1] 37 38 39 40 36 35 27 24 21 23 25 26 54 58 66 77 79 81 83 84 87 86 82 68 57
## [26] 49 41 48 51 53 52 55 56 69 71 73 75 91 92 89 85 76 90 88 47 30 29 32 43 45
## [51] 44 42 34 33 31 28 46 59 78 70 64 60 94 93 96 65 50 74 63 61 72 62 22 67 80
## [76] 95 15 20 17 18 16 19 14 97 98 10 13 12 11 0
# Unique variables of hour
c(unique(bikes["Hour"]))
## $Hour
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
# Transforming factor date column into date
bikes$Date <- as.POSIXct(bikes$Date, format = "%d/%m/%Y")
bikes$Date <- as.Date(bikes$Date)
# Bikes rented over time plot
a <- ggplot(data = bikes) +
geom_line(aes(x = Date, y = Rented.Bike.Count),
color = "#09557f",
alpha = 0.6,
size = 0.6) +
theme_minimal() +
labs(x = "Date",
y = "Bikes" ,
title = "Base Plot")
a <- ggplotly(a)
a
#### Exploratory analysis ####
# Bikes rented per season plot
b <- ggplot(data = bikes) +
geom_line(aes(x = Date, y = Rented.Bike.Count, col = Seasons),
alpha = 0.6,
size = 0.6) +
scale_color_manual(labels = c("Autumn", "Spring", "Summer", "Winter"),
values = c("red", "green", "yellow", "blue")) +
theme_minimal() +
labs(x = "Date",
y = "Bikes" ,
title = "Bikes rented per seasons ")
b <- ggplotly(b)
b
# Bikes rented overtime in holidays and normal days plot
c <- ggplot(data = bikes) +
geom_line(aes(x = Date, y = Rented.Bike.Count, col = Holiday),
alpha = 0.6,
size = 0.6) +
theme_minimal() +
labs(x = "Date",
y = "Bikes" ,
title = "Bikes rented holiday and normal days")
c <- ggplotly(c)
c
# Rainfall and solar radiation in different seasons in the bike shops plot
d <- ggplot(data = bikes) +
geom_line(aes(x = Date, y = Rainfall.mm., col = Seasons),
alpha = 0.6,
size = 0.6) +
geom_line(aes(x = Date, y = Solar.Radiation..MJ.m2.),
alpha = 0.6,
size = 0.6, color = "orange") +
scale_color_manual(labels = c("Autumn", "Spring", "Summer", "Winter"),
values = c("red", "green", "yellow", "blue")) +
theme_minimal() +
labs(x = "Date",
y = "Rainfall mm and solar radiation" ,
title = "Rainfall and solar radiation in different seasons in the bike shops")
d <- ggplotly(d)
d
e <- ggplot(data = bikes) +
geom_line(aes(x = Date, y = Temperature..C., col = Seasons),
alpha = 0.6,
size = 0.6) +
geom_line(aes(x = Date, y = Wind.speed..m.s.),
alpha = 0.6,
size = 0.6, color = "grey") +
scale_color_manual(labels = c("Autumn", "Spring", "Summer", "Winter"),
values = c("red", "green", "yellow", "blue")) +
theme_minimal() +
labs(x = "Date",
y = "Temperature in C and wind speed" ,
title = "Temperature and wind speed in different seasons in the bike shops")
e <- ggplotly(e)
e
#### Pre processing ####
# Attribute enginnering the seasons column
# transforming all seasons in categorical variables
all_seasons_col = 1 # Winter
all_seasons_col = 2 # Spring
all_seasons_col = 3 # Summer
all_seasons_col = 4 # Autumn
# sorting winter only column
winter_s <- sqldf(
"SELECT * FROM bikes WHERE Seasons LIKE '%Winter' "
)
winter_s <- cbind(winter_s, all_seasons_col)
# sorting spring only column
Spring_s <- sqldf(
"SELECT * FROM bikes WHERE Seasons LIKE '%Spring' "
)
Spring_s <- cbind(Spring_s, all_seasons_col)
# sorting autumn only column
Autumn_S <- sqldf(
"SELECT * FROM bikes WHERE Seasons LIKE '%Autumn' "
)
Autumn_S <- cbind(Autumn_S, all_seasons_col)
# sorting summer only column
Summer_s <- sqldf(
"SELECT * FROM bikes WHERE Seasons LIKE '%Summer' "
)
Summer_s <- cbind(Summer_s, all_seasons_col)
# merging all seasons in the same dataframe
all_seasons <- merge(winter_s, Spring_s, all = TRUE)
all_seasons2 <- merge(Autumn_S, Summer_s, all = TRUE)
all_seasons_final <- merge(all_seasons, all_seasons2, all = TRUE)
# variables correlations plot for feature selection
all_seasons_final$Date <- NULL
all_seasons_final$Seasons<- NULL
all_seasons_final$Holiday <- NULL
all_seasons_final$Functioning.Day <- NULL
correlations <- cor(all_seasons_final,method="pearson")
## Warning in cor(all_seasons_final, method = "pearson"): o desvio padrão é zero
corrplot(correlations, number.cex = .9, method = "circle", type = "full", tl.cex=0.8,tl.col = "black",
title = "bikes")

# removing not important columns for machine learning
all_seasons_final$Functioning.Day <- NULL
all_seasons_final$Hour <- NULL
all_seasons_final$Dew.point.temperature..C. <- NULL
# converting some columns variables types
all_seasons_final$Visibility..10m. <- as.numeric(all_seasons_final$Visibility..10m.)
all_seasons_final$Humidity... <- as.numeric(all_seasons_final$Humidity...)
all_seasons_final$Rented.Bike.Count <- as.numeric(all_seasons_final$Rented.Bike.Count)
# separating data into train and test
indexes <- sample(1:nrow(all_seasons_final), size = 0.7 * nrow(all_seasons_final))
train.data.bikes <- all_seasons_final[indexes,]
test.data.bikes <- all_seasons_final[-indexes,]
class(train.data.bikes)
## [1] "data.frame"
class(test.data.bikes)
## [1] "data.frame"
str(train.data.bikes)
## 'data.frame': 6132 obs. of 9 variables:
## $ Rented.Bike.Count : num 1076 561 490 79 322 ...
## $ Temperature..C. : num 30.7 22.2 23.7 -3.6 28.3 -9 1.3 2.9 3 27.3 ...
## $ Humidity... : num 55 84 76 78 84 54 67 45 26 49 ...
## $ Wind.speed..m.s. : num 1.5 1.1 1.9 1.6 0.5 0.6 1.2 1.6 2 0.9 ...
## $ Visibility..10m. : num 2000 482 1281 471 1006 ...
## $ Solar.Radiation..MJ.m2.: num 0 0 0.08 0 0 0 0 0.04 1.01 1.17 ...
## $ Rainfall.mm. : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Snowfall..cm. : num 0 0 0 0 0 0 0 0 0 0 ...
## $ all_seasons_col : num 4 4 4 4 4 4 4 4 4 4 ...
#### Machine learning ####
# Creating linear regression machine learning model
modelo <- lm(Rented.Bike.Count ~. , data = train.data.bikes)
# prevision for the testing data
prevision1 <- predict(modelo, test.data.bikes)
## Warning in predict.lm(modelo, test.data.bikes): prediction from a rank-deficient
## fit may be misleading
summary(modelo)
##
## Call:
## lm(formula = Rented.Bike.Count ~ ., data = train.data.bikes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1382.44 -301.81 -50.63 221.97 2358.07
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 874.66896 43.88692 19.930 <2e-16 ***
## Temperature..C. 35.77723 0.66219 54.029 <2e-16 ***
## Humidity... -11.04399 0.48269 -22.880 <2e-16 ***
## Wind.speed..m.s. 59.16871 6.80406 8.696 <2e-16 ***
## Visibility..10m. -0.01271 0.01299 -0.978 0.3281
## Solar.Radiation..MJ.m2. -115.22038 9.97684 -11.549 <2e-16 ***
## Rainfall.mm. -57.34208 6.29146 -9.114 <2e-16 ***
## Snowfall..cm. 39.25025 15.86454 2.474 0.0134 *
## all_seasons_col NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 496.8 on 6124 degrees of freedom
## Multiple R-squared: 0.4078, Adjusted R-squared: 0.4072
## F-statistic: 602.5 on 7 and 6124 DF, p-value: < 2.2e-16
# making plot of real and predictive data
x = 1:length(test.data.bikes$Rented.Bike.Count)
plot(x, test.data.bikes$Rented.Bike.Count, col = "red", type = "l", lwd=2,
main = "Bikes selling real and predictive data (linear regression)")
lines(x, prevision1, col = "blue", lwd=2)
legend("topleft", legend = c("real data", "predictive data"),
fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# decision tree model
ma_model_tree <- rpart(Rented.Bike.Count ~.,
data = train.data.bikes)
rpart.plot(ma_model_tree)

pred_model <- predict(ma_model_tree, test.data.bikes)
printcp(ma_model_tree)
##
## Regression tree:
## rpart(formula = Rented.Bike.Count ~ ., data = train.data.bikes)
##
## Variables actually used in tree construction:
## [1] Humidity... Solar.Radiation..MJ.m2. Temperature..C.
##
## Root node error: 2552796844/6132 = 416307
##
## n= 6132
##
## CP nsplit rel error xerror xstd
## 1 0.240967 0 1.00000 1.00020 0.021192
## 2 0.099732 1 0.75903 0.76114 0.016797
## 3 0.028707 2 0.65930 0.66623 0.015415
## 4 0.027141 3 0.63059 0.64896 0.014841
## 5 0.019555 4 0.60345 0.61083 0.014225
## 6 0.018036 5 0.58390 0.59219 0.013955
## 7 0.015525 6 0.56586 0.59105 0.013974
## 8 0.010000 7 0.55034 0.56484 0.013858
# making plot of real and predictive data
x = 1:length(test.data.bikes$Rented.Bike.Count)
plot(x, test.data.bikes$Rented.Bike.Count, col = "red", type = "l", lwd=2,
main = "Bikes selling real and predictive data (decision tree)")
lines(x, pred_model, col = "blue", lwd=2)
legend("topleft", legend = c("real data", "predictive data"),
fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# Randomforest model
ma_model <- randomForest(Rented.Bike.Count ~., data = train.data.bikes)
print(ma_model)
##
## Call:
## randomForest(formula = Rented.Bike.Count ~ ., data = train.data.bikes)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 182022.6
## % Var explained: 56.28
plot(ma_model)

summary(ma_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 6132 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 6132 -none- numeric
## importance 8 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 6132 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
pred_model <- predict(ma_model, test.data.bikes)
summary(ma_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 6132 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 6132 -none- numeric
## importance 8 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 6132 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
# making plot of real and predictive data
x = 1:length(test.data.bikes$Rented.Bike.Count)
plot(x, test.data.bikes$Rented.Bike.Count, col = "red", type = "l", lwd=2,
main = "Bikes selling real and predictive data (random forest)")
lines(x, pred_model, col = "blue", lwd=2)
legend("topleft", legend = c("real data", "predictive data"),
fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# knn control archive
ctrl <- trainControl(method = "repeatedcv", repeats = 3)
# knn model
knn_v1 <- train(Rented.Bike.Count ~ .,
data = train.data.bikes,
method = "knn",
trControl = ctrl,
tuneLength = 20)
knn_v1
## k-Nearest Neighbors
##
## 6132 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 5520, 5518, 5519, 5518, 5518, 5518, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 503.1865 0.4032724 355.5974
## 7 496.9177 0.4119199 353.5838
## 9 491.1785 0.4223761 352.1396
## 11 490.4463 0.4233921 353.5994
## 13 489.4242 0.4254915 354.3774
## 15 490.7260 0.4226356 356.4222
## 17 492.5776 0.4185130 358.6990
## 19 494.4054 0.4145140 361.1090
## 21 496.1044 0.4108650 363.5777
## 23 497.4366 0.4080626 365.3939
## 25 498.7921 0.4051974 367.0448
## 27 499.9006 0.4029197 368.3748
## 29 501.6528 0.3990586 370.4802
## 31 503.2077 0.3956401 372.1997
## 33 504.7390 0.3922293 374.0067
## 35 506.3056 0.3885574 375.6480
## 37 507.6085 0.3855883 377.3169
## 39 508.7033 0.3831809 378.9777
## 41 510.0202 0.3800894 380.4712
## 43 511.1313 0.3775884 381.8409
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
pred_model <- predict(knn_v1, test.data.bikes)
# making plot of real and predictive data
x = 1:length(test.data.bikes$Rented.Bike.Count)
plot(x, test.data.bikes$Rented.Bike.Count, col = "red", type = "l", lwd=2,
main = "Bikes selling real and predictive data (knn)")
lines(x, pred_model, col = "blue", lwd=2)
legend("topleft", legend = c("real data", "predictive data"),
fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()
